The source R Markdown document is available here: Download identify_gene_promoter_regions.Rmd. # Load R packages
suppressMessages(library(tidyverse))
suppressMessages(library(AnnotationHub))
suppressMessages(library(rtracklayer))
suppressMessages(library(glue))
suppressMessages(library(biomaRt))
suppressMessages(library(RColorBrewer))
suppressMessages(library(parallel))
Select TSS with the highest FANTOM CAT transcription initiation evidence score (TIEScore).
# If the data directory doesn't already exist, create it
if(!dir.exists("data")){ dir.create(file.path("data"), showWarnings = FALSE) }
# Obtain FANTOM CAT robust annotation if not already done
if(!file.exists("data/FANTOM_CAT.lv3_robust.gtf.gz")){
download.file(url="http://fantom.gsc.riken.jp/5/suppl/Hon_et_al_2016/data/assembly/lv3_robust/FANTOM_CAT.lv3_robust.gtf.gz", destfile = "data/FANTOM_CAT.lv3_robust.gtf.gz")
}
# Select protein-coding tx TSS that has the greatest TIEScore support
# Transcription Initiation Evidence Score (TIEScore) is a custom metric produced for FANTOM CAT to quantify the likelihood that a CAGE transcription start site is genuine. We select the transcript with the highest TIEScore.
FANTOM_TSS <- as_tibble(readGFF("data/FANTOM_CAT.lv3_robust.gtf.gz")) %>%
group_by(gene_id) %>%
mutate(geneClass=unique(geneClass)[1], geneSuperClass=unique(geneSuperClass)[1], gene_name=unique(gene_name)[1]) %>% # add to all rows of gene annotation
mutate(number_entries = dplyr::n()) %>% # Should be at least three entries for a gene (gene + transcript + exon)
filter(number_entries>2) %>%
arrange(desc(TIEScore)) %>%
filter(row_number() == 1) %>%
ungroup() %>%
filter(geneSuperClass=="all_mRNA" & str_detect(gene_id, "ENSG")) %>% # Exclude "CATG___" genes, which appear to be lncRNAs that have been marked as having coding potential
filter(type=="transcript" & seqid%in%c(glue('chr{1:22}'), glue('chr{c("X","Y","M")}'))) %>% # restrict to reference chromosomes
dplyr::select(seqid, start, end, strand, gene_id, gene_name, TIEScore) %>%
arrange(as.character(seqid), start) %>%
mutate(gene_id = str_replace_all(gene_id, "ENSGR", "ENSG")) %>% # changes 7 gene IDs from pseudoautosomal regions
mutate(end = case_when(strand=="+" ~ start, TRUE ~ end), # Change start and end coordinates from tx start/end to TSS
start = case_when(strand=="+" ~ start, TRUE ~ end),
ensembl_gene_id = str_split_fixed(gene_id,"[.]",2)[,1]) %>%
dplyr::select(-gene_id)
# What information do we have?
head(FANTOM_TSS)
# How many genes?
nrow(FANTOM_TSS)
## [1] 19002
#[1] 19002
# Get gene info from biomaRt
if(!file.exists("data/biomaRt_query.rds")){
gene_info <- getBM(attributes = c("ensembl_gene_id", "ensembl_gene_id_version", "percentage_gene_gc_content", "hgnc_id", "hgnc_symbol", "entrezgene_id"),
filters = "ensembl_gene_id",
values = FANTOM_TSS$ensembl_gene_id,
mart = useMart(biomart="ENSEMBL_MART_ENSEMBL", host="sep2019.archive.ensembl.org", path="/biomart/martservice", dataset="hsapiens_gene_ensembl"))
saveRDS(gene_info, "data/biomaRt_query.rds")
} else {
gene_info <- readRDS("data/biomaRt_query.rds")
}
# Sometimes more than one NCBI gene ID (entrezgene_id) is returned. In these cases, collapse to one row and join values separated by commas
gene_info <- gene_info %>%
group_by(ensembl_gene_id) %>%
summarise(ensembl_gene_id_version = paste(unique(ensembl_gene_id_version), collapse=", "),
percentage_gene_gc_content = paste(unique(percentage_gene_gc_content), collapse=", "),
hgnc_id = paste(unique(hgnc_id), collapse=", "),
hgnc_symbol = paste(unique(hgnc_symbol), collapse=", "),
entrezgene_id = paste(unique(entrezgene_id), collapse=", ")) %>%
ungroup()
# Note that some ENSEMBL gene IDs present in hg19 are no longer in the current version of ENSEMBL
dim(gene_info)
## [1] 18610 6
dim(FANTOM_TSS)
## [1] 19002 7
# Join by ensembl_gene_id
FANTOM_TSS <- merge(x = FANTOM_TSS, y = gene_info, by = "ensembl_gene_id", all.x = FALSE) %>%
arrange(as.character(seqid), start)
# Set column names
colnames(FANTOM_TSS) <- c("ensembl_gene_id", "TSS_chromosome", "TSS_start", "TSS_end", "TSS_strand","FANTOM_gene_name", "TIEScore", "ensembl_gene_id_version", "percentage_gene_gc_content", "HGNC_ID", "HGNC_Symbol", "NCBI_gene_ID")
# clean up
rm(gene_info)
Import DNase narrowPeak ranges from the Roadmap Epigenomics project.
# Get annotation hub
ahub <- AnnotationHub()
snapshotDate(ahub)
## [1] "2019-10-29"
# [1] '2019-10-29'
# Create 'promoter' GRanges - 5Kb interval centred on the TSS
gene_promoters <- promoters(makeGRangesFromDataFrame(FANTOM_TSS, keep.extra.columns = TRUE),
upstream = 2500, downstream = 2500)
# Extract Roadmap Epigenomics Project DNase narrowPeak records for 111 uniformly
# processed human epigenomes (53 have DNase-seq data available)
npDNase <- query(ahub, c("EpigenomeRoadMap", "Narrow DNasePeaks for consolidated epigenomes",
"macs2"))
# How many files returned?
length(npDNase)
## [1] 53
# Take a quick look
head(npDNase)
## AnnotationHub with 6 records
## # snapshotDate(): 2019-10-29
## # $dataprovider: BroadInstitute
## # $species: Homo sapiens
## # $rdataclass: GRanges
## # additional mcols(): taxonomyid, genome, description,
## # coordinate_1_based, maintainer, rdatadateadded, preparerclass, tags,
## # rdatapath, sourceurl, sourcetype
## # retrieve records with, e.g., 'object[["AH29873"]]'
##
## title
## AH29873 | E003-DNase.macs2.narrowPeak.gz
## AH29903 | E004-DNase.macs2.narrowPeak.gz
## AH29924 | E005-DNase.macs2.narrowPeak.gz
## AH29949 | E006-DNase.macs2.narrowPeak.gz
## AH29972 | E007-DNase.macs2.narrowPeak.gz
## AH29995 | E008-DNase.macs2.narrowPeak.gz
# List all of the AnnotationHub identifiers for reproducibility
names(npDNase)
## [1] "AH29873" "AH29903" "AH29924" "AH29949" "AH29972" "AH29995" "AH30078"
## [8] "AH30129" "AH30138" "AH30177" "AH30190" "AH30199" "AH30274" "AH30283"
## [15] "AH30308" "AH30317" "AH30326" "AH30340" "AH30460" "AH30469" "AH30477"
## [22] "AH30485" "AH30494" "AH30503" "AH30512" "AH30528" "AH30537" "AH30546"
## [29] "AH30555" "AH30564" "AH30573" "AH30582" "AH30603" "AH30612" "AH30627"
## [36] "AH30688" "AH30720" "AH30743" "AH30755" "AH30767" "AH30779" "AH30791"
## [43] "AH30803" "AH30815" "AH30828" "AH30841" "AH30853" "AH30865" "AH30877"
## [50] "AH30890" "AH31947" "AH31961" "AH31988"
# Retrieve all of the narrowPeak DNase data as GRanges, keeping only the ranges
# that overlap our annotated promoters (promoter_hg19) Define GRanges to hold
# just the narrowPeak regions that overlap promoters
if (!file.exists("data/DNase_narrowPeak_overlap_promoters.rds")) {
np_promoter <- GRanges()
for (i in 1:length(npDNase)) {
# Import narrowPeak file
np <- npDNase[[i]]
# Add tissue info
np$tissue <- mcols(npDNase)$tags[[i]][8]
# Normalise scores to be between zero and one
scores <- np$pValue
scores <- scores - min(scores)
scores <- scores/max(scores)
np$norm_score <- scores
# Restrict to the subset of peaks that overlaps a protein-coding promoter region
# and add to np_promoter
np_promoter <- c(np_promoter, np[overlapsAny(np, gene_promoters)])
# updata
print(glue("Finished processing file {i} of {length(npDNase)}"))
}
saveRDS(np_promoter, "data/DNase_narrowPeak_overlap_promoters.rds")
rm(i, np_promoter, np)
}
# clean up
rm(ahub, npDNase)
Quick visual exploration.
# Import DNase narrowPeak data
np_promoter <- readRDS("data/DNase_narrowPeak_overlap_promoters.rds")
# Define function that accepts a gene ID and creates some simple plots of the promoter and nearby DNase narrowPeak ranges
plot_DNase <- function(gene_name, three_prime_width=2500, five_prime_width=2500){
# Get the promoter GRanges for the gene of interest
goi <- gene_promoters[gene_promoters$HGNC_Symbol==gene_name]
# Find DNase narrowPeaks that overlap the promoter
overlapping_narrowPeaks <- np_promoter[overlapsAny(np_promoter,goi)]
# Order GRanges by norm_score
overlapping_narrowPeaks <- overlapping_narrowPeaks[order(overlapping_narrowPeaks$norm_score, decreasing = TRUE)]
# Create vector of colour values, based on the narrowPeak$norm_score value
plot_cols <- colorRampPalette(brewer.pal(11,"Spectral"))(100)[as.numeric(cut(overlapping_narrowPeaks$norm_score,breaks = 100))]
## Plot the point-source called for each DNase narrowPeak range
# Create empty plot
par(mar=c(5.1,25,4.1,2.1)) # bottom, left, top and right margins
plot("",xlim=c(start(goi)-three_prime_width, end(goi)+five_prime_width), ylim=c(-2,length(overlapping_narrowPeaks)+1), ylab="", yaxt="n", xlab="chromosome position",
main=gene_name)
axis(side=2, labels = overlapping_narrowPeaks$tissue, at = 1:length(overlapping_narrowPeaks)+0.5,las=2)
rect(xleft = start(goi), ybottom = -2, xright = end(goi), ytop = 0, col = "red", border=NA)
# Plot points for point-source of each narrowPeak
for(i in 1:length(overlapping_narrowPeaks)){
points(start(overlapping_narrowPeaks[i]) + overlapping_narrowPeaks[i]$peak, i+0.5, pch=21, col="black", bg=plot_cols[i], cex=1.5)
}
# Add a horizontal dashed red line, indicating where the cumulative fraction of norm_score values across all narrowPeak ranges exceeds 0.75 (i.e. the normalised scores of the narrowPeak ranges below this line make up 75% of the total)
cum_sum_cutoff <- which(cumsum(overlapping_narrowPeaks$norm_score)/sum(overlapping_narrowPeaks$norm_score)>0.75)[1]
abline(h=cum_sum_cutoff, col="red", lty=2)
# Restrict the DNase narrowPeaks to those below the line
overlapping_narrowPeaks <- overlapping_narrowPeaks[1:cum_sum_cutoff]
# Restrict the narrowPeak ranges to just the point source
start(overlapping_narrowPeaks) <- start(overlapping_narrowPeaks) + overlapping_narrowPeaks$peak
end(overlapping_narrowPeaks) <- start(overlapping_narrowPeaks)
# Check whether the point sources are spread across a region >=50bp - otherwise a rolling mean can't be calculated
if(diff(range(start(overlapping_narrowPeaks)))>=50){
# Create a data frame to hold the rolling mean of the norm_score values using a 50bp sliding window
roll_mean_scores <- data.frame(pos = min(start(overlapping_narrowPeaks)):max(start(overlapping_narrowPeaks)),
score = 0)
# Add the norm-score values
for(i in 1:length(overlapping_narrowPeaks)){
position <- roll_mean_scores$pos==start(overlapping_narrowPeaks[i])
roll_mean_scores$score[position] <- roll_mean_scores$score[position] + overlapping_narrowPeaks[i]$norm_score
}
# Calculate the position where the rolling mean of norm_scores is greatest
roll_mean_peak <- roll_mean_scores$pos[1] + which.max(zoo::rollmean(x=roll_mean_scores$score, k=50))+25
# Add two blue lines to mark the the 50bp region around the roll_mean_peak
abline(v=roll_mean_peak-25, col="blue", lty=2);
abline(v=roll_mean_peak+25, col="blue", lty=2);
# Restrict the narrowPeaks point sources to those within the selected region (those peaks below the dashed red line and between the two solid blue lines)
overlapping_narrowPeaks <- overlapping_narrowPeaks[(start(overlapping_narrowPeaks) > roll_mean_peak-25) & (start(overlapping_narrowPeaks) < roll_mean_peak+25)]
}
# Calculate weighted mean position of the remaining peaks, using the norm_scores as the weights
centre_pos <- as.integer(round(weighted.mean(x = start(overlapping_narrowPeaks), w = overlapping_narrowPeaks$norm_score), 0))
# Mark the position of the DNase peak centre with a thick blue line
abline(v=centre_pos, col="blue", lwd=2)
par(mar=c(5.1, 4.1, 4.1, 2.1))
}
plot_DNase("PAX6")
plot_DNase("GAPDH")
plot_DNase("MMP3")
# Clean up
rm(plot_DNase)
Our approach of identifying the strongest DNase peak appears to be working fairly well for now. Proceed to apply the method to all genes, adding the position of the DNase peak to the FANTOM_TSS data frame.
# Define function to identify peaks
identify_DNase <- function(row_num){
# Get the promoter GRanges for the gene of interest
goi <- gene_promoters[row_num]
# Find DNase narrowPeaks that overlap the promoter
overlapping_narrowPeaks <- np_promoter[overlapsAny(np_promoter,goi)]
# Order GRanges by norm_score
overlapping_narrowPeaks <- overlapping_narrowPeaks[order(overlapping_narrowPeaks$norm_score, decreasing = TRUE)]
# Check that we have at least one peak to work with, otherwise there's nothing to be done, in which case return NA
if(length(overlapping_narrowPeaks)==0){return(NA)}
# identify where the cumulative fraction of norm_score values across all narrowPeak ranges exceeds 0.75 (i.e. the normalised scores of the narrowPeak ranges below this line make up 75% of the total)
cum_sum_cutoff <- which(cumsum(overlapping_narrowPeaks$norm_score)/sum(overlapping_narrowPeaks$norm_score)>0.75)[1]
# Restrict the DNase narrowPeaks
overlapping_narrowPeaks <- overlapping_narrowPeaks[1:cum_sum_cutoff]
# Restrict the narrowPeak ranges to just the point source
start(overlapping_narrowPeaks) <- start(overlapping_narrowPeaks) + overlapping_narrowPeaks$peak
end(overlapping_narrowPeaks) <- start(overlapping_narrowPeaks)
# Check whether the point sources are spread across a region >=50bp - otherwise a rolling mean can't be calculated
if(diff(range(start(overlapping_narrowPeaks)))>=50){
# Create a data frame to hold the rolling mean of the norm_score values using a 50bp sliding window
roll_mean_scores <- data.frame(pos = min(start(overlapping_narrowPeaks)):max(start(overlapping_narrowPeaks)),
score = 0)
# Add the norm-score values
for(i in 1:length(overlapping_narrowPeaks)){
position <- roll_mean_scores$pos==start(overlapping_narrowPeaks[i])
roll_mean_scores$score[position] <- roll_mean_scores$score[position] + overlapping_narrowPeaks[i]$norm_score
}
# Calculate the position where the rolling mean of norm_scores is greatest
roll_mean_peak <- roll_mean_scores$pos[1] + which.max(zoo::rollmean(x=roll_mean_scores$score, k=50))+25
# Restrict the narrowPeaks point sources to those within 50bp of the peak
overlapping_narrowPeaks <- overlapping_narrowPeaks[(start(overlapping_narrowPeaks) > roll_mean_peak-25) & (start(overlapping_narrowPeaks) < roll_mean_peak+25)]
}
# Calculate weighted mean position of the remaining peaks, using the norm_scores as the weights
centre_pos <- as.integer(round(weighted.mean(x = start(overlapping_narrowPeaks), w = overlapping_narrowPeaks$norm_score), 0))
return(centre_pos)
}
# Apply function to identify dominant DNase peak for all genes
if(!file.exists("data/centre_pos.rds")){
detectCores()
centre_pos <- unlist(mclapply(1:length(gene_promoters), identify_DNase))
saveRDS(centre_pos, "data/centre_pos.rds")
} else {
centre_pos <- readRDS("data/centre_pos.rds")
}
if(!file.exists("data/TSS_DNase.rds")){
saveRDS(FANTOM_TSS, "data/TSS_DNase.rds")
} else {
FANTOM_TSS <- readRDS("data/TSS_DNase.rds")
}